import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
from utils import *
# Show full cell contents and up to 100 columns when displaying DataFrames.
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', 100)
# Larger fonts and thicker lines for every matplotlib figure in this notebook.
matplotlib.rcParams.update({"font.size": 16,'lines.linewidth': 2.5})
# matplotlib.rcdefaults()
# Figures are saved to ./tmp/figures
# Load the four PAI trace tables and build the merged analysis frame.
# get_df / get_dfa come from utils (imported above).
DATA_DIR = '../data/'
_tables = {name: get_df(DATA_DIR + 'pai_%s_table.csv' % name)
           for name in ('job', 'task', 'instance', 'group_tag')}
dfj = _tables['job']        # jobs
dft = _tables['task']       # tasks
dfi = _tables['instance']   # instances
dfg = _tables['group_tag']  # group tags
dfa = get_dfa(dft, dfj, dfi, dfg)
# get_dfa merge progress output: dft + dfj ... dft + dfj + dfi ... dft + dfj + dfi + dfg ...
# Per-task usage share for each GPU type: for every GPU type, the fraction
# of its rows (among rows that specify plan_gpu) contributed by each task.
data_df = dfa
# Keep only rows that actually specify a GPU plan.
data_df_1 = data_df[data_df.plan_gpu.notnull()]

GPU_TYPES = ['MISC', 'P100', 'T4', 'V100', 'V100M32']

def _task_usage_share(gpu_name):
    """Return a Series indexed by task_name: the fraction of rows of the
    given GPU type that belong to each task."""
    sub = data_df_1[data_df_1.gpu_type == gpu_name]
    return sub.groupby('task_name').count().job_name / len(sub)

# One column per GPU type. The original notebook repeated this computation
# five times by hand; a loop removes the copy-paste. Column names keep the
# original spelling ("useage") so any downstream cells keep working.
gpu_task_useage = pd.concat([_task_usage_share(g) for g in GPU_TYPES], axis=1)
gpu_task_useage.columns = [g + '_task_useage' for g in GPU_TYPES]
gpu_task_useage = gpu_task_useage.fillna(0)
gpu_task_useage.reset_index(inplace=True)

# Long format for a grouped horizontal bar chart.
gpu_task_useage_melt = pd.melt(gpu_task_useage, id_vars=["task_name"],
                               value_name='percentage')
fig = px.bar(gpu_task_useage_melt, x='percentage', y='task_name',
             width=1000, height=700, color='variable', barmode='group')
fig.show()
# Split jobs by whether they request a fractional (shared) GPU.
# plan_gpu presumably encodes GPU demand in units of 100 (100 == one full
# GPU) — TODO confirm against the trace docs — so a value that is not a
# multiple of 100 means the job shares a GPU.
# Fixes vs. original: the two comments were swapped (the `% 100 > 0` slice
# is the SHARED one), the discarded `.assign(Name='shared')` calls and the
# `data_df_2_shared.shared = True` attribute assignment were no-ops, and
# missing `.copy()` triggered SettingWithCopyWarning.
data_df_2 = data_df_1
# Fractional GPU request -> shared.
data_df_2_shared = data_df_2[data_df_2.plan_gpu % 100 > 0].copy()
# Whole-GPU request -> not shared.
data_df_2_non_shared = data_df_2[data_df_2.plan_gpu % 100 == 0].copy()
data_df_2_shared["shared"] = True
data_df_2_non_shared["shared"] = False
data_df_2 = pd.concat([data_df_2_shared, data_df_2_non_shared], ignore_index=True)
# NOTE: stray notebook output — pandas raised SettingWithCopyWarning twice here
# ("A value is trying to be set on a copy of a slice from a DataFrame. Try using
# .loc[row_indexer, col_indexer] = value instead"); see the pandas user guide on
# returning-a-view-versus-a-copy.
# Fraction of all tasks falling into each (task_name, shared) bucket,
# shown as a grouped bar chart per task type.
temp = data_df_2.groupby(['task_name', 'shared']).count().reset_index()
total_tasks = temp.job_name.sum()
temp.job_name = temp.job_name / total_tasks
fig = px.histogram(temp, x="task_name", y="job_name",
                   color='shared', barmode='group', height=400)
fig.update_layout(title="Percentage of total tasks being shared by each task type",
                  xaxis_title="Task names",
                  yaxis_title="Percentage of total tasks",
                  legend_title="Using Shared GPU")
fig.show()
# "Percentage of GPU being shared"
# Same breakdown as above, but per GPU type instead of per task type.
temp = data_df_2.groupby(['gpu_type', 'shared']).count().reset_index()
total_tasks = temp.job_name.sum()
temp.job_name = temp.job_name / total_tasks
fig = px.histogram(temp, x="gpu_type", y="job_name",
                   color='shared', barmode='group', height=400)
fig.update_layout(title="Percentage of total tasks being shared by each GPU type",
                  xaxis_title="GPU type",
                  yaxis_title="Percentage of total tasks",
                  legend_title="Using Shared GPU")
fig.show()
# Note that this plot only works under Jupyter Notebook or JupyterLab.
# "Percentage of GPU being shared"
def plot_percentage_shared_by_GPU(gpu_name="MISC"):
    """Plot the task mix on one GPU type, split by shared vs. whole-GPU use.

    Args:
        gpu_name: GPU type to plot (one of the gpu_type values, e.g. "MISC").

    Percentages are relative to ALL (gpu_type, task_name, shared) rows, not
    just the selected GPU type — kept identical to the original behaviour.
    Reads the module-level `data_df_2`.
    (Fix vs. original: a leading `data_df_2.groupby(...)` whose result was
    discarded — dead code — has been removed.)
    """
    temp = data_df_2.groupby(['gpu_type', "task_name", 'shared']).count().reset_index()
    temp.job_name = temp.job_name/temp.job_name.sum()
    fig = px.histogram(temp[temp.gpu_type == gpu_name], x="shared", y="job_name",
                       color='task_name',
                       height=400, width=400)
    fig.update_layout(
        title="Percentage of total tasks on " + gpu_name,
        xaxis_title="Using Shared GPU",
        yaxis_title="Percentage of total tasks",
        legend_title="Task name"
    )
    fig.show()
# NOTE(review): `from __future__` imports must be the first statement of a
# module; this only works because each notebook cell is compiled separately,
# and the import is unnecessary on Python 3 anyway.
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
# Interactive dropdown that re-renders the per-GPU plot for the chosen type.
v = interact(plot_percentage_shared_by_GPU, gpu_name=['T4','MISC','P100','V100','V100M32'])
display(v)
# Output: interactive(children=(Dropdown(description='gpu_name', index=1, options=('T4', 'MISC', 'P100', 'V100', 'V100M3…
# Output: <function __main__.plot_percentage_shared_by_GPU(gpu_name='MISC')>
# Count jobs for every distinct (plan_gpu, plan_cpu, plan_mem, task_name)
# resource combination. The original counted every column and then dropped
# 21 of them; aggregating just job_name (the only column kept) is equivalent
# and far less fragile.
df_summarize_resource = (
    data_df_2
    .groupby(['plan_gpu', 'plan_cpu', 'plan_mem', 'task_name'])
    .agg(count=('job_name', 'count'))
    .reset_index()
)
def set_marker_size(x):
    """Bucket a summary row's job count into a discrete marker size (1-5).

    Args:
        x: a row (or mapping) exposing a "count" entry.

    Buckets: count < 100 -> 1, < 500 -> 2, < 1000 -> 3, < 2000 -> 4, else 5.
    """
    count = x["count"]
    for size, upper in enumerate((100, 500, 1000, 2000), start=1):
        if count < upper:
            return size
    return 5
# Row-wise marker size, then a 3-D scatter of the resource combinations,
# coloured by task name. (apply takes the function directly — the original
# wrapped it in a redundant lambda.)
df_summarize_resource['marker_size'] = df_summarize_resource.apply(set_marker_size, axis=1)
fig = px.scatter_3d(
    df_summarize_resource,
    x='plan_gpu', y='plan_cpu', z='plan_mem',
    color='task_name', hover_data=["count"],
    opacity=0.7, size="marker_size",
    width=1200, height=800,
)
fig.show()
# by GPU type
# Same resource-combination summary as above, grouped by gpu_type instead
# of task_name. Again, aggregate only job_name rather than counting every
# column and dropping the rest.
df_summarize_resource_gpu = (
    data_df_2
    .groupby(['plan_gpu', 'plan_cpu', 'plan_mem', 'gpu_type'])
    .agg(count=('job_name', 'count'))
    .reset_index()
)
df_summarize_resource_gpu['marker_size'] = df_summarize_resource_gpu.apply(set_marker_size, axis=1)
fig = px.scatter_3d(
    df_summarize_resource_gpu,
    x='plan_gpu', y='plan_cpu', z='plan_mem',
    color='gpu_type', hover_data=["count"],
    opacity=0.7, size="marker_size",
    width=1200, height=800,
)
fig.show()
# We preprocess the data by selecting the planned resources and the hour the
# task was initiated. Some wait times were negative, which is impossible, so we
# removed those rows. We then drop NAs and split the data for training.
data_df.wait_time.describe()
# Output: count 1.165653e+06, mean 3.562137e+02, std 5.199093e+03,
# min -5.134800e+04, 25% 3.0, 50% 8.0, 75% 13.0, max 5.998170e+05
# Name: wait_time, dtype: float64
from sklearn.model_selection import train_test_split

# Preprocess data for wait-time prediction.
df = data_df
# Negative wait times are impossible -> drop them.
df = df[df.wait_time >= 0]
# Only use completed tasks.
df = df[df.status == "Terminated"]
df = add_hour_date(df)
# Features believed to influence wait time, plus the target itself.
feature_cols = ["task_name", "plan_cpu", "plan_mem", "plan_gpu",
                "gpu_type", "hour", "wait_time"]
df_train = df[feature_cols].dropna()
# One-hot encode the categoricals (drop_first removes a redundant level).
df_train = pd.get_dummies(data=df_train, drop_first=True)
# Output: wait time.
Y = df_train['wait_time']
# Input: everything else.
X = df_train.drop(['wait_time'], axis=1)
# We first try a linear regression model to estimate the wait time: it is
# simple to understand, and our exploration suggests some variables seem to
# correlate roughly linearly with the target.
# Train a baseline linear-regression model on a 60/40 train/test split.
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=101)
model = LinearRegression().fit(X_train, y_train)
# Output: LinearRegression()
# The plot of the regression line:
from sklearn.metrics import mean_squared_error

# Predicted vs. true wait time for the linear model, with a fitted line.
predictions1 = model.predict(X_test)
g = sns.regplot(x=list(y_test), y=predictions1)
g.set(xlabel='True wait time', ylabel='Predicted wait time')
# Output: [Text(0, 0.5, 'Predicted wait time')]
# RMSE of the linear model on the held-out set
# (square root of MSE — identical to squared=False).
mean_squared_error(y_test, predictions1) ** 0.5
# Output (RMSE): 4010.3750400876725
# The second model we try is a RandomForest, which fits several decision trees
# on sub-samples of the dataset and averages them to improve predictive
# accuracy and control over-fitting. We chose it because there could be some
# non-linearity between the variables and the wait time, which trees can
# capture, and training on different samples of the data reduces variance.
# Train a random forest on the same 60/40 split (same random_state, so the
# result is directly comparable to the linear model).
from sklearn.ensemble import RandomForestRegressor

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=101)
model = RandomForestRegressor().fit(X_train, y_train)
# Output: RandomForestRegressor()
from sklearn.metrics import mean_squared_error

# Predicted vs. true wait time for the random-forest model.
predictions2 = model.predict(X_test)
g = sns.regplot(x=list(y_test), y=predictions2)
g.set(xlabel='True wait time', ylabel='Predicted wait time')
# Output: [Text(0, 0.5, 'Predicted wait time')]
# RMSE of the random-forest model on the held-out set
# (square root of MSE — identical to squared=False).
mean_squared_error(y_test, predictions2) ** 0.5
# Output (RMSE): 4072.746099120997